{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "0e6878b7-26fa-4813-9b36-6dec20d11346",
   "metadata": {},
   "source": [
    "# Aggregating Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dd64b9b2-3b7d-403d-aae9-5db682368086",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a6fc4149-3f32-477f-ae54-f529aea7c9d9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (1, 3)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>median_result</th><th>most_frequent_result</th><th>variance</th></tr><tr><td>f64</td><td>i64</td><td>f64</td></tr></thead><tbody><tr><td>11.0</td><td>10</td><td>20.989616</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (1, 3)\n",
       "┌───────────────┬──────────────────────┬───────────┐\n",
       "│ median_result ┆ most_frequent_result ┆ variance  │\n",
       "│ ---           ┆ ---                  ┆ ---       │\n",
       "│ f64           ┆ i64                  ┆ f64       │\n",
       "╞═══════════════╪══════════════════════╪═══════════╡\n",
       "│ 11.0          ┆ 10                   ┆ 20.989616 │\n",
       "└───────────────┴──────────────────────┴───────────┘"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "math_students = pl.read_parquet(\"math.parquet\")\n",
    "\n",
    "(\n",
    "    math_students.select(\n",
    "        median_result=pl.col(\"G3\").median(),\n",
    "        most_frequent_result=pl.col(\"G3\").mode(),\n",
    "        variance=pl.col(\"G3\").var(),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9f40f57-c895-44b1-bad0-22aa8aafaeea",
   "metadata": {},
   "source": [
    "# Grouping Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "c108a17a-d4d9-4183-b0eb-58c6abe70a7a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (4, 5)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>subject</th><th>internet</th><th>total</th><th>passes</th><th>percentage</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>u32</td><td>str</td></tr></thead><tbody><tr><td>&quot;M&quot;</td><td>&quot;yes&quot;</td><td>329</td><td>117</td><td>&quot;35.56%&quot;</td></tr><tr><td>&quot;M&quot;</td><td>&quot;no&quot;</td><td>66</td><td>14</td><td>&quot;21.21%&quot;</td></tr><tr><td>&quot;P&quot;</td><td>&quot;yes&quot;</td><td>498</td><td>231</td><td>&quot;46.39%&quot;</td></tr><tr><td>&quot;P&quot;</td><td>&quot;no&quot;</td><td>151</td><td>45</td><td>&quot;29.8%&quot;</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (4, 5)\n",
       "┌─────────┬──────────┬───────┬────────┬────────────┐\n",
       "│ subject ┆ internet ┆ total ┆ passes ┆ percentage │\n",
       "│ ---     ┆ ---      ┆ ---   ┆ ---    ┆ ---        │\n",
       "│ str     ┆ str      ┆ u32   ┆ u32    ┆ str        │\n",
       "╞═════════╪══════════╪═══════╪════════╪════════════╡\n",
       "│ M       ┆ yes      ┆ 329   ┆ 117    ┆ 35.56%     │\n",
       "│ M       ┆ no       ┆ 66    ┆ 14     ┆ 21.21%     │\n",
       "│ P       ┆ yes      ┆ 498   ┆ 231    ┆ 46.39%     │\n",
       "│ P       ┆ no       ┆ 151   ┆ 45     ┆ 29.8%      │\n",
       "└─────────┴──────────┴───────┴────────┴────────────┘"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "all_students = pl.read_parquet(\"course.parquet\")\n",
    "\n",
    "(\n",
    "    all_students.group_by([\"subject\", \"internet\"])\n",
    "    .agg(\n",
    "        total=pl.col(\"student_id\").count(),\n",
    "        passes=pl.col(\"G3\").filter(pl.col(\"G3\") > 12).count(),\n",
    "    )\n",
    "    .select(\n",
    "        pl.col(\"subject\", \"internet\", \"total\", \"passes\"),\n",
    "        percentage=pl.format(\n",
    "            \"{}%\", (pl.col(\"passes\") * 100 / pl.col(\"total\")).round(2)\n",
    "        ),\n",
    "    )\n",
    "    .sort(\"subject\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc485940-3f73-4c50-9a7e-221d8c6df070",
   "metadata": {},
   "source": [
    "# Grouping Time Series Data With .group_by_dynamic()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "4082c2e0-b30f-4a08-940f-9744a6d0a1b3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (18, 3)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>lecturer_initials</th><th>class_start</th><th>absences</th></tr><tr><td>str</td><td>datetime[μs]</td><td>f64</td></tr></thead><tbody><tr><td>&quot;DH&quot;</td><td>2024-01-01 00:00:00</td><td>4.0</td></tr><tr><td>&quot;DH&quot;</td><td>2024-02-01 00:00:00</td><td>3.4</td></tr><tr><td>&quot;DH&quot;</td><td>2024-03-01 00:00:00</td><td>4.25</td></tr><tr><td>&quot;DH&quot;</td><td>2024-04-01 00:00:00</td><td>2.5</td></tr><tr><td>&quot;DH&quot;</td><td>2024-05-01 00:00:00</td><td>2.0</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;LM&quot;</td><td>2024-02-01 00:00:00</td><td>3.0</td></tr><tr><td>&quot;LM&quot;</td><td>2024-03-01 00:00:00</td><td>3.4</td></tr><tr><td>&quot;LM&quot;</td><td>2024-04-01 00:00:00</td><td>3.8</td></tr><tr><td>&quot;LM&quot;</td><td>2024-05-01 00:00:00</td><td>3.666667</td></tr><tr><td>&quot;LM&quot;</td><td>2024-06-01 00:00:00</td><td>4.6</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (18, 3)\n",
       "┌───────────────────┬─────────────────────┬──────────┐\n",
       "│ lecturer_initials ┆ class_start         ┆ absences │\n",
       "│ ---               ┆ ---                 ┆ ---      │\n",
       "│ str               ┆ datetime[μs]        ┆ f64      │\n",
       "╞═══════════════════╪═════════════════════╪══════════╡\n",
       "│ DH                ┆ 2024-01-01 00:00:00 ┆ 4.0      │\n",
       "│ DH                ┆ 2024-02-01 00:00:00 ┆ 3.4      │\n",
       "│ DH                ┆ 2024-03-01 00:00:00 ┆ 4.25     │\n",
       "│ DH                ┆ 2024-04-01 00:00:00 ┆ 2.5      │\n",
       "│ DH                ┆ 2024-05-01 00:00:00 ┆ 2.0      │\n",
       "│ …                 ┆ …                   ┆ …        │\n",
       "│ LM                ┆ 2024-02-01 00:00:00 ┆ 3.0      │\n",
       "│ LM                ┆ 2024-03-01 00:00:00 ┆ 3.4      │\n",
       "│ LM                ┆ 2024-04-01 00:00:00 ┆ 3.8      │\n",
       "│ LM                ┆ 2024-05-01 00:00:00 ┆ 3.666667 │\n",
       "│ LM                ┆ 2024-06-01 00:00:00 ┆ 4.6      │\n",
       "└───────────────────┴─────────────────────┴──────────┘"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "math_attendance = pl.read_parquet(\"math_classes.parquet\")\n",
    "\n",
    "(\n",
    "    math_attendance.group_by_dynamic(\n",
    "        index_column=\"class_start\",\n",
    "        every=\"1mo\",\n",
    "        closed=\"both\",\n",
    "        group_by=\"lecturer_initials\",\n",
    "    ).agg(pl.col(\"absences\").mean())\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a72b3a67-0799-47ae-965c-843425593464",
   "metadata": {},
   "source": [
    "## Grouping and Aggregating Using Window Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "f9ba580b-f724-45f1-a31e-f95cb4612564",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (452, 6)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>subject</th><th>reason</th><th>G1</th><th>mean_G1</th><th>G2</th><th>mean_G2</th></tr><tr><td>str</td><td>str</td><td>i64</td><td>f64</td><td>i64</td><td>f64</td></tr></thead><tbody><tr><td>&quot;M&quot;</td><td>&quot;home&quot;</td><td>15</td><td>10.816514</td><td>14</td><td>10.743119</td></tr><tr><td>&quot;M&quot;</td><td>&quot;reputation&quot;</td><td>15</td><td>11.457143</td><td>15</td><td>11.257143</td></tr><tr><td>&quot;M&quot;</td><td>&quot;home&quot;</td><td>12</td><td>10.816514</td><td>12</td><td>10.743119</td></tr><tr><td>&quot;M&quot;</td><td>&quot;home&quot;</td><td>16</td><td>10.816514</td><td>18</td><td>10.743119</td></tr><tr><td>&quot;M&quot;</td><td>&quot;home&quot;</td><td>14</td><td>10.816514</td><td>15</td><td>10.743119</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;P&quot;</td><td>&quot;home&quot;</td><td>17</td><td>11.657718</td><td>18</td><td>11.785235</td></tr><tr><td>&quot;P&quot;</td><td>&quot;home&quot;</td><td>14</td><td>11.657718</td><td>15</td><td>11.785235</td></tr><tr><td>&quot;P&quot;</td><td>&quot;other&quot;</td><td>14</td><td>10.694444</td><td>17</td><td>10.777778</td></tr><tr><td>&quot;P&quot;</td><td>&quot;course&quot;</td><td>15</td><td>10.982456</td><td>15</td><td>11.147368</td></tr><tr><td>&quot;P&quot;</td><td>&quot;course&quot;</td><td>11</td><td>10.982456</td><td>12</td><td>11.147368</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (452, 6)\n",
       "┌─────────┬────────────┬─────┬───────────┬─────┬───────────┐\n",
       "│ subject ┆ reason     ┆ G1  ┆ mean_G1   ┆ G2  ┆ mean_G2   │\n",
       "│ ---     ┆ ---        ┆ --- ┆ ---       ┆ --- ┆ ---       │\n",
       "│ str     ┆ str        ┆ i64 ┆ f64       ┆ i64 ┆ f64       │\n",
       "╞═════════╪════════════╪═════╪═══════════╪═════╪═══════════╡\n",
       "│ M       ┆ home       ┆ 15  ┆ 10.816514 ┆ 14  ┆ 10.743119 │\n",
       "│ M       ┆ reputation ┆ 15  ┆ 11.457143 ┆ 15  ┆ 11.257143 │\n",
       "│ M       ┆ home       ┆ 12  ┆ 10.816514 ┆ 12  ┆ 10.743119 │\n",
       "│ M       ┆ home       ┆ 16  ┆ 10.816514 ┆ 18  ┆ 10.743119 │\n",
       "│ M       ┆ home       ┆ 14  ┆ 10.816514 ┆ 15  ┆ 10.743119 │\n",
       "│ …       ┆ …          ┆ …   ┆ …         ┆ …   ┆ …         │\n",
       "│ P       ┆ home       ┆ 17  ┆ 11.657718 ┆ 18  ┆ 11.785235 │\n",
       "│ P       ┆ home       ┆ 14  ┆ 11.657718 ┆ 15  ┆ 11.785235 │\n",
       "│ P       ┆ other      ┆ 14  ┆ 10.694444 ┆ 17  ┆ 10.777778 │\n",
       "│ P       ┆ course     ┆ 15  ┆ 10.982456 ┆ 15  ┆ 11.147368 │\n",
       "│ P       ┆ course     ┆ 11  ┆ 10.982456 ┆ 12  ┆ 11.147368 │\n",
       "└─────────┴────────────┴─────┴───────────┴─────┴───────────┘"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_students = pl.read_parquet(\"course.parquet\")\n",
    "\n",
    "all_students.select(\n",
    "    pl.col(\"subject\", \"reason\", \"G1\"),\n",
    "    mean_G1=pl.col(\"G1\").mean().over(\"subject\", \"reason\"),\n",
    "    G2=pl.col(\"G2\"),\n",
    "    mean_G2=pl.col(\"G2\").mean().over(\"subject\", \"reason\"),\n",
    ").filter(\n",
    "    (pl.col(\"G1\") > pl.col(\"mean_G1\")) & (pl.col(\"G2\") > pl.col(\"mean_G2\"))\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a562f108-2900-4e54-980c-ae6c3dad0fc3",
   "metadata": {},
   "source": [
    "# Grouping and Aggregating Using Pivot Tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "8708da4d-9b85-4ea4-a108-5612ed001dff",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (4, 6)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>subject</th><th>sex</th><th>G1_GP</th><th>G1_MS</th><th>G2_GP</th><th>G2_MS</th></tr><tr><td>str</td><td>str</td><td>f64</td><td>f64</td><td>f64</td><td>f64</td></tr></thead><tbody><tr><td>&quot;M&quot;</td><td>&quot;F&quot;</td><td>10.579235</td><td>10.92</td><td>10.398907</td><td>10.32</td></tr><tr><td>&quot;M&quot;</td><td>&quot;M&quot;</td><td>11.337349</td><td>10.380952</td><td>11.204819</td><td>10.047619</td></tr><tr><td>&quot;P&quot;</td><td>&quot;F&quot;</td><td>12.28692</td><td>10.582192</td><td>12.50211</td><td>10.719178</td></tr><tr><td>&quot;P&quot;</td><td>&quot;M&quot;</td><td>11.602151</td><td>9.7875</td><td>11.688172</td><td>10.0875</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (4, 6)\n",
       "┌─────────┬─────┬───────────┬───────────┬───────────┬───────────┐\n",
       "│ subject ┆ sex ┆ G1_GP     ┆ G1_MS     ┆ G2_GP     ┆ G2_MS     │\n",
       "│ ---     ┆ --- ┆ ---       ┆ ---       ┆ ---       ┆ ---       │\n",
       "│ str     ┆ str ┆ f64       ┆ f64       ┆ f64       ┆ f64       │\n",
       "╞═════════╪═════╪═══════════╪═══════════╪═══════════╪═══════════╡\n",
       "│ M       ┆ F   ┆ 10.579235 ┆ 10.92     ┆ 10.398907 ┆ 10.32     │\n",
       "│ M       ┆ M   ┆ 11.337349 ┆ 10.380952 ┆ 11.204819 ┆ 10.047619 │\n",
       "│ P       ┆ F   ┆ 12.28692  ┆ 10.582192 ┆ 12.50211  ┆ 10.719178 │\n",
       "│ P       ┆ M   ┆ 11.602151 ┆ 9.7875    ┆ 11.688172 ┆ 10.0875   │\n",
       "└─────────┴─────┴───────────┴───────────┴───────────┴───────────┘"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_students = pl.read_parquet(\"course.parquet\")\n",
    "(\n",
    "    all_students.pivot(\n",
    "        on=\"school\",\n",
    "        index=[\"subject\", \"sex\"],\n",
    "        values=[\"G1\", \"G2\"],\n",
    "        aggregate_function=\"mean\",\n",
    "    )\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
